Testing primers against PR2

1 Goal

  • Testing primers against the PR2 database (latest version 4.11.1)

2 Notes about script

  • Chunks compute_matches and store_matches have only to be run in 2 cases
    • new version of pr2
    • new set of primers
    • In the other case, they can be set eval=FALSE

2.1 To do

  • Check why there are some negative amplicon (probably some matches ahead - Put a condition that amplicon should be >)

5 Read primer file

5.2 Build the primer set table 1

# primer_sets <- read_excel(path = "primers_18S.xlsx", sheet ="primer_sets")
# primers <- read_excel(path = "primers_18S.xlsx", sheet ="primers")

# Read from local database
pr2_db <- db_info("pr2_local")
pr2_db_con <- db_connect(pr2_db)
primers <- tbl(pr2_db_con, "pr2_primers") %>% collect()
primer_sets_all <- tbl(pr2_db_con, "pr2_primer_sets") %>% collect()
disconnect <- db_disconnect(pr2_db_con)

if(gene_selected == "18S_rRNA") {
  gene_regions = c("V4", "V9")
  # Just keep the selected primers (V4, V9 etc..)
  primer_sets <- primer_sets_all %>% 
  filter(gene == gene_selected) %>% 
  filter(gene_region %in%  gene_regions) %>% 
  filter(!(primer_set_id %in% 63:66))
  } else{
  primer_sets <- primer_sets %>% 
  filter(specificity ==  "plastid" | (gene == "18S_rRNA" & specificity == "universal" ))
}

primer_sets <- primer_sets %>% 
  left_join(select(primers, 
                   primer_id, 
                   fwd_name=name,
                   fwd_seq=sequence, 
                   fwd_start_yeast= start_yeast, 
                   fwd_end_yeast= end_yeast), 
            by = c("fwd_id" = "primer_id")) %>% 
  left_join(select(primers, 
                 primer_id, 
                 rev_name=name,
                 rev_seq=sequence, 
                 rev_start_yeast= start_yeast, 
                 rev_end_yeast= end_yeast), 
          by = c("rev_id" = "primer_id")) %>% 
  mutate(length_yeast = rev_end_yeast - fwd_start_yeast + 1) %>% 
  select(gene_region, specificity, primer_set_id,primer_set_name, contains("fwd"), contains("rev"),length_yeast, used_in:remark) %>% 
  arrange(gene_region,  fwd_start_yeast)

write_tsv(primer_sets, path = "primers_Table_1.tsv", na = "")



knitr::kable(select(primer_sets,primer_set_id:primer_set_name, fwd_name, rev_name, length_yeast)) %>%
  kableExtra::kable_styling()
primer_set_id primer_set_name fwd_name rev_name length_yeast
40 Zhan Uni18SF Uni18SR 461
12 Geisen 3NDf 1132r 599
13 Brate1 3NDf V4_euk_R1 465
14 Brate2 3NDf V4_euk_R2 458
18 Parfrey 515F 1119r 598
33 Needham 515F Univ 926R 589
34 Lambert 515FY NSR951 391
35 UNonMet EUK581-F EUK1134-R 578
4 Hugerth2 563f 1132r 587
7 Bass V4_1f TAReukREV3 417
8 StoeckV4 TAReuk454FWD1 TAReukREV3 417
16 PireddaV4 TAReuk454FWD1 V4 18S Next.Rev 417
19 Vannini Claudia Vannini (F) Claudia Vannini (R) 439
36 StoeckV4 TAReuk454FWD1 V4RB 417
1 Hadziavdic1 F-566 R-1200 635
15 Moreno EUKAF EUKAR 410
17 Comeau E572F E1009R 438
22 Kim 528F Nex_18S_0964_R 419
26 Pawlowski 528F S12.2R 445
25 Mangot NSF563 NSR951 380
2 Hadziavdic2 A-528F R-952 379
39 Egge A-528F PRYM01+7 396
3 Hugerth1 574*f 1132r 576
23 Venter 590F 1300R 720
77 Hugerth5 574f 1132r 576
21 Zimmerman D512for D978rev 437
41 Harder Cerc479F Cerc750R 283
24 Simon EK-565F-NGS EUK1134-R 527
5 Hugerth3 616f 1132r 534
6 Hugerth4 616*f 1132r 534
28 Amaral1 1380F 1510R 176
29 Amaral2 1389F 1510R 167
31 PireddaV9 1388F 1510R 167
27 StoeckV9 1391F EukB 169

6 Computing the matches

This part is done by an R script PR2 Primers pr2_match.R that is executed in batch mode.

6.1 Programing Notes

  • Use Biostrings

Accessor methods : In the code snippets below, x is an MIndex object.

  • length(x): The number of patterns that matches are stored for.
  • names(x): The names of the patterns that matches are stored for.
  • startIndex(x): A list containing the starting positions of the matches for each pattern.
  • endIndex(x): A list containing the ending positions of the matches for each pattern.
  • elementNROWS(x): An integer vector containing the number of matches for each pattern.
  • x[[i]]: Extract the matches for the i-th pattern as an IRanges object.
  • unlist(x, recursive=TRUE, use.names=TRUE): Return all the matches in a single IRanges object. recursive and use.names are ignored.
  • extractAllMatches(subject, mindex): Return all the matches in a single XStringViews object.

One could also use another function which does not give the position * match_fwd <- vcountPattern(fwd, seq,max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=FALSE, algorithm=“auto”)

8 Summarize the data in tables

9 Amplicon length

9.2 Plot an example of amplicon distribution (Fig. 3)

10 Graphics

10.1 All Eukaryotes

Comments

  • Percent of sequences amplified
    • Logically, the % of seq amplified is always < min(% of sequences matching forwar, % of sequences matching reverse)
    • In general it is the reverse primer that causes problems.
    • Some primer sets do not amplify any sequence (11, 19, 20, 21)
    • For V9, primer set 30 reverse primer is in the ITS region which is not present in the PR2 sequences, so no amplification.
  • Amplicon sizes
    • Only 8 V4 primer sets are suitable for Illumina 2x250 (max amplicon size = 450 bp)
    • This is if we consider the mean… If we consider the variation around the mean then, only 3 suitable for 2x250
    • Five more V4 primer sets are suitable for Illumina 3x250 (max amplicon size = 550 bp)
for (one_region in gene_regions) {
  
  pr2_match_summary_primer_set_region_long <- filter(pr2_match_summary_primer_set_long, gene_region== one_region)  
  pr2_match_summary_primer_set_region <- filter(pr2_match_summary_primer_set, gene_region== one_region)
  pr2_match_region <- filter(pr2_match_final, gene_region == one_region)
  
  g <- ggplot(pr2_match_summary_primer_set_region_long) + 
    geom_col(aes(x=reorder(primer_label, primer_set_id), y=pct_seq, 
                 fill=fct_reorder(pct_category, pct_category_order)), width=.7, position = "dodge") +
    theme_bw() +
    xlab("Primer set") + ylab("% of sequences amplified") + 
    scale_fill_manual(name = "% amplified",  values = c("pct_amplicons" = "black", "pct_fwd" = "blue","pct_rev" = "red"), 
                      labels=c( "Primer fwd", "Primer rev", "Amplicons")) +
    ggtitle (str_c(one_region, " - Percentage of sequences recovered"))+
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
  print(g)


   g <- ggplot(filter(pr2_match_summary_primer_set_region, !is.nan(ampli_size_mean))) + 
    geom_point(aes(x=reorder(primer_label, primer_set_id), y=ampli_size_mean), colour="black") +
    geom_errorbar(aes(x=reorder(primer_label, primer_set_id), 
                      ymax=ampli_size_mean + ampli_size_sd, 
                      ymin=ampli_size_mean - ampli_size_sd)) +
    theme_bw() +
    xlab("Primer set") + ylab("Amplicon size (bp)") +
    # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
    ggtitle (str_c(one_region, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively") ) +
    geom_hline(yintercept = c(450,550), linetype = c(2,3) )+
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
   
  print(g)
  
     g <- ggplot(pr2_match_region) + 
    geom_boxplot(aes(x=reorder(primer_label, primer_set_id), y=ampli_size), colour="black", outlier.alpha = 0.3) +
    theme_bw() +
    xlab("Primer set") + ylab("Amplicon size (bp)") +
    # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
    ggtitle (str_c(one_region, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively") ) +
    geom_hline(yintercept = c(450,550), linetype = c(2,3) )+
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
   
  print(g)
    }

Daniel Vaulot

02 08 2019